# Import python packages
%matplotlib inline
import pandas as pd
import numpy as np
import scipy as sp
import random as rd
import pymc3 as pm
import theano as Th
import theano.tensor as tt
from matplotlib import pyplot as plt
import arviz as az
from IPython.display import display, Math, Latex
# Helper functions
def stdize(x):
return (x-np.mean(x))/np.std(x)
def indexall(L):
poo = []
for p in L:
if not p in poo:
poo.append(p)
Ix = np.array([poo.index(p) for p in L])
return poo,Ix
data = pd.read_csv('data/processedData.csv')
data.head()
Extract data as numpy arrays.
FIPS = data['FIPS']
nFIPS = len(FIPS)
CP = data['casesPerCap']
V = data['volatility']
SW = data['bidenSwing']
W = data['propWhite']
P = data['margin2016']
sData = data['state']
cData = data['FIPS']
# Grouping by state to get governance for each state.
grouped_df = data.groupby("state")
first_values = grouped_df.first()
G = first_values['governanceScore']
State, Is = indexall(sData)
County, Ic = indexall(cData)
Looking at the distributions of processedData.csv
_, ax = plt.subplots(1,5, figsize=(20,4))
ax[0].hist(CP)
ax[0].set_xlabel('CP', fontsize=17)
ax[0].set_ylabel('Count', fontsize=17)
ax[1].hist(V)
ax[1].set_xlabel('V', fontsize=17)
ax[2].hist(SW)
ax[2].set_xlabel('SW', fontsize=17)
ax[3].hist(W)
ax[3].set_xlabel('W', fontsize=17)
ax[4].hist(P)
ax[4].set_xlabel('P', fontsize=17)
plt.show()
Some more data processing.
# Log transform cases per capita
logCP = np.log(CP)
# Standardize all covariates except governance and minority proportion
logCP = stdize(logCP)
V = stdize(V)
SW = stdize(SW)
P = stdize(P)
M = 1-W
Plot distributions of processed data.
_, ax = plt.subplots(ncols=3, nrows=2, figsize=(25,12))
ax[0][0].hist(logCP)
ax[0][0].set_xlabel('logCP', fontsize=30)
ax[0][0].set_ylabel('Count', fontsize=30)
ax[0][1].hist(V)
ax[0][1].set_xlabel('V', fontsize=30)
ax[0][2].hist(SW)
ax[0][2].set_xlabel('SW', fontsize=30)
ax[1][0].hist(M)
ax[1][0].set_xlabel('M', fontsize=30)
ax[1][1].hist(P)
ax[1][1].set_xlabel('P', fontsize=30)
ax[1][2].hist(G)
ax[1][2].set_xlabel('G', fontsize=30)
plt.savefig('figs/dataDistribution.png')
plt.show()
with pm.Model(coords = {'State': State, 'County': FIPS}) as hierarchical_intercept:
# Hyperpriors for intercept
g = pm.Normal("g", mu=0.0, sigma=3.5, shape=2)
sigma_a = pm.Exponential("sigma_a", 1.0)
# hyperpriors for slopes
mu_c = pm.Normal("mu_c", mu=0.0, sigma=0.4)
sigma_c = pm.Exponential("sigma_c", 1)
mu_m = pm.Normal('mu_m', 0, 0.4)
sigma_m = pm.Exponential('sigma_m', 1)
mu_v = pm.Normal('mu_v', 0, 0.4)
sigma_v = pm.Exponential('sigma_v', 1)
mu_p = pm.Normal('mu_p', 0, 0.4)
sigma_p = pm.Exponential('sigma_p', 1)
# Varying intercepts (state):
mu_a = pm.Deterministic("a", g[0] + g[1] * G, dims="State")
za_state = pm.Normal("za_state", mu=0.0, sigma=1.0, dims="State")
a_state = pm.Deterministic("a_state", mu_a + za_state * sigma_a, dims="State")
#varying slopes (county)
zb_c = pm.Normal("zb_c", mu=0.0, sigma=0.4, dims="County")
b_c = pm.Deterministic("b_c", mu_c + zb_c * sigma_c, dims="County")
zb_m = pm.Normal("zb_m", mu=0.0, sigma=0.4, dims="County")
b_m = pm.Deterministic("b_m", mu_m + zb_m * sigma_m, dims="County")
zb_v = pm.Normal("zb_v", mu=0.0, sigma=0.4, dims="County")
b_v = pm.Deterministic("b_v", mu_v + zb_v * sigma_v, dims="County")
zb_p = pm.Normal("zb_p", mu=0.0, sigma=0.4, dims="County")
b_p = pm.Deterministic("b_p", mu_p + zb_p * sigma_p, dims="County")
# PyMC3 Data objects for plotting later
Ic_ = pm.intX(pm.Data("Ic", Ic))
P_ = pm.Data("P", P)
# Expected value per county:
theta = pm.Deterministic('theta', a_state[Is] + b_c * CP + b_m * M + b_v * V + b_p * P)
# Model error:
sigma = pm.Exponential("sigma", 1.0)
y = pm.Normal("y", theta, sigma=sigma, observed=SW, dims = "County")
ppd = pm.sample_prior_predictive(200, model=hierarchical_intercept)
_, ax = plt.subplots(2,2, figsize=(10,10))
xnew = np.linspace(-5, 5, 50)
ax[0,0].scatter(logCP, SW)
[ax[0,0].plot(xnew, b0+b1*xnew, alpha=0.3, c='black') for b0,b1 in zip(ppd['a_state'].T[1], ppd['b_c'].T[1])]
#ax[0,0].set_title('logCOVID vs Biden swing', fontsize=16)
ax[0,0].axhline(np.max(SW), color='r')
ax[0,0].axhline(np.min(SW), color='r')
ax[0,0].set_ylim(-20, 20)
ax[0,0].set_xlabel('log(Cases per Capita)', fontsize=17)
ax[0,1].scatter(M, SW)
[ax[0,1].plot(xnew, b0+b1*xnew, alpha=0.3, c='black') for b0,b1 in zip(ppd['a_state'].T[1], ppd['b_m'].T[1])]
#ax[0,1].set_title('Minorities vs Biden swing', fontsize=16)
ax[0,1].axhline(np.max(SW), color='r')
ax[0,1].axhline(np.min(SW), color='r')
ax[0,1].set_ylim(-20, 20)
ax[0,1].set_xlabel('Proportion Minorities', fontsize=17)
ax[1,0].scatter(V, SW)
[ax[1,0].plot(xnew, b0+b1*xnew, alpha=0.3, c='black') for b0,b1 in zip(ppd['a_state'].T[1],ppd['b_v'].T[1])]
#ax[1,0].set_title('Volatility vs Biden swing', fontsize=16)
ax[1,0].axhline(np.max(SW), color='r')
ax[1,0].axhline(np.min(SW), color='r')
ax[1,0].set_ylim(-20, 20)
ax[1,0].set_xlabel('Volatility', fontsize=17)
ax[1,1].scatter(P, SW)
[ax[1,1].plot(xnew, b0+b1*xnew, alpha=0.3, c='black') for b0,b1 in zip(ppd['a_state'].T[1],ppd['b_p'].T[1])]
#ax[1,1].set_title('Presidential2016 vs Biden swing', fontsize=16)
ax[1,1].axhline(np.max(SW), color='r')
ax[1,1].axhline(np.min(SW), color='r')
ax[1,1].set_ylim(-20, 20)
ax[1,1].set_xlabel('2016 Presidential Margin', fontsize=17)
ax[0,0].set_ylabel('Biden Swing', fontsize=17)
ax[1,0].set_ylabel('Biden Swing', fontsize=17)
plt.tight_layout()
plt.savefig('figs/countyPrior.png')
plt.show()
with hierarchical_intercept:
hierarchical_intercept_idata = pm.sample(1000, target_accept = 0.99)
#2000, tune=2000, target_accept=0.99)
summary = pm.summary(hierarchical_intercept_idata, var_names=['g', 'mu_c', 'mu_m', 'mu_v', 'mu_p'])
summary
az.plot_trace(hierarchical_intercept_idata, var_names= ['g', 'mu_c', 'mu_m', 'mu_v', 'mu_p', 'sigma'])
plt.show()
az.plot_posterior(hierarchical_intercept_idata, var_names= ['g', 'mu_c', 'mu_m', 'mu_v', 'mu_p', 'sigma'])
plt.show()
Bold = IQR, thin = 94% HDI
az.plot_forest(hierarchical_intercept_idata, var_names= ['g', 'mu_c', 'mu_m', 'mu_v', 'mu_p'])
plt.axvline(0)
plt.savefig('figs/countyForestPlot.png')
plt.show()
PostPred = pd.DataFrame(hierarchical_intercept_idata['theta'], columns=data.FIPS)
y = PostPred.median(0).values
y_l95 = np.percentile(PostPred,2.5,axis=0)
y_u95 = np.percentile(PostPred,97.5,axis=0)
PostPredDf = pd.DataFrame([data['FIPS'], y, y_l95, y_u95, SW, logCP, M, V, P]).T
PostPredDf.columns = ['FIPS', 'pred', 'y_195', 'y_u95', 'SW', 'logCP', 'M', 'V', 'P']
PostPredDf
PostPredDf.to_csv('postPredCounty.csv')
xnew = np.linspace(-6, 4, 100)
az.style.use("arviz-darkgrid")
plt.scatter(logCP, SW)
[plt.plot(xnew, b0+b1*xnew+b2*xnew+b3*xnew+b4*xnew, alpha=0.008, c='black') for b0,b1,b2,b3,b4 in zip(hierarchical_intercept_idata['a_state'].T[1], hierarchical_intercept_idata['mu_c'], hierarchical_intercept_idata['mu_m'], hierarchical_intercept_idata['mu_v'], hierarchical_intercept_idata['mu_p'])]
plt.xlabel("Log COVID cases per capita")
plt.ylabel("Swing for Biden")
plt.savefig('figs/countyPosteriorCovidLines.png')
plt.show()
# Number of values
N = 3103
# Range of positive standardized population
xnew = np.linspace(-6,4,N)
with hierarchical_intercept:
# Low contact predictions
pm.set_data({'Ic':np.array([0]*N), 'P':xnew})
ynew = pm.sample_posterior_predictive(hierarchical_intercept_idata, var_names=['y'])['y']
# Grab trendlines
ymu = ynew.mean(0)
az.plot_hdi(xnew, ynew, color="b", fill_kwargs={"alpha": 0.5})
plt.plot(xnew, ymu, color="b", alpha=0.7, label='Model prediction (94% HDI)')
plt.scatter(logCP, SW, c='grey', alpha=0.3, label='Expected values')
az.style.use("arviz-darkgrid")
plt.xlabel("Log COVID cases per capita")
plt.ylabel("Swing for Biden")
plt.legend(loc='lower left')
plt.savefig('figs/countyPosteriorPoints.png')
plt.show()
error = np.abs(PostPredDf.y_195 - PostPredDf.y_u95)
plt.errorbar(PostPredDf['logCP'], PostPredDf['pred'], yerr=error, c='grey', fmt='o', alpha=0.3, label='Model Predictions')
plt.scatter(PostPredDf['logCP'], PostPredDf['SW'], c='black', label='Expected')
plt.xlabel("Log COVID cases per capita")
plt.ylabel("Swing for Biden")
az.style.use('arviz-darkgrid')
plt.legend(loc='lower left')
plt.savefig('figs/countyPosteriorIndividualPoints.png')
with pm.Model(coords = {'State': State, 'County': FIPS}) as state_slopes_model:
# Hyperpriors for intercept
g = pm.Normal("g", mu=0.0, sigma=3.5, shape=2)
sigma_a = pm.Exponential("sigma_a", 1.0)
# hyperpriors for slopes
mu_c = pm.Normal("mu_c", mu=0.0, sigma=0.5)
sigma_c = pm.Exponential("sigma_c", 1)
mu_m = pm.Normal('mu_m', 0, 0.5)
sigma_m = pm.Exponential('sigma_m', 1)
mu_v = pm.Normal('mu_v', 0, 0.5)
sigma_v = pm.Exponential('sigma_v', 1)
mu_p = pm.Normal('mu_p', 0, 0.5)
sigma_p = pm.Exponential('sigma_p', 1)
# Varying intercepts (state):
mu_a = pm.Deterministic("a", g[0] + g[1] * G, dims="State")
za_state = pm.Normal("za_state", mu=0.0, sigma=1.0, dims="State")
a_state = pm.Deterministic("a_state", mu_a + za_state * sigma_a, dims="State")
#varying slopes (state)
zb_c = pm.Normal("zb_c", mu=0.0, sigma=1, dims="State")
b_c = pm.Deterministic("b_c", mu_c + zb_c * sigma_c, dims="State")
zb_m = pm.Normal("zb_m", mu=0.0, sigma=1, dims="State")
b_m = pm.Deterministic("b_m", mu_m + zb_m * sigma_m, dims="State")
zb_v = pm.Normal("zb_v", mu=0.0, sigma=1, dims="State")
b_v = pm.Deterministic("b_v", mu_v + zb_v * sigma_v, dims="State")
zb_p = pm.Normal("zb_p", mu=0.0, sigma=1, dims="State")
b_p = pm.Deterministic("b_p", mu_p + zb_p * sigma_p, dims="State")
# Expected value per county:
theta = pm.Deterministic('theta', a_state[Is] + b_c[Is] * CP + b_m[Is] * M + b_v[Is] * V + b_p[Is] * P)
# Model error:
sigma = pm.Exponential("sigma", 1.0)
y = pm.Normal("y", theta, sigma=sigma, observed=SW, dims = "County")
ppd = pm.sample_prior_predictive(200, model=state_slopes_model)
_, ax = plt.subplots(2,2, figsize=(10,10))
xnew = np.linspace(-6, 6, 50)
ax[0,0].scatter(logCP, SW)
[ax[0,0].plot(xnew, b0+b1*xnew, alpha=0.3, c='black') for b0,b1 in zip(ppd['a_state'].T[1], ppd['b_c'].T[1])]
ax[0,0].set_title('logCOVID vs Biden swing', fontsize=16)
ax[0,0].axhline(np.max(SW), color='r')
ax[0,0].axhline(np.min(SW), color='r')
ax[0,0].set_ylim(-20, 20)
ax[0,1].scatter(M, SW)
[ax[0,1].plot(xnew, b0+b1*xnew, alpha=0.3, c='black') for b0,b1 in zip(ppd['a_state'].T[1], ppd['b_m'].T[1])]
ax[0,1].set_title('Minorities vs Biden swing', fontsize=16)
ax[0,1].axhline(np.max(SW), color='r')
ax[0,1].axhline(np.min(SW), color='r')
ax[0,1].set_ylim(-20, 20)
ax[1,0].scatter(V, SW)
[ax[1,0].plot(xnew, b0+b1*xnew, alpha=0.3, c='black') for b0,b1 in zip(ppd['a_state'].T[1],ppd['b_v'].T[1])]
ax[1,0].set_title('Volatility vs Biden swing', fontsize=16)
ax[1,0].axhline(np.max(SW), color='r')
ax[1,0].axhline(np.min(SW), color='r')
ax[1,0].set_ylim(-20, 20)
ax[1,1].scatter(P, SW)
[ax[1,1].plot(xnew, b0+b1*xnew, alpha=0.3, c='black') for b0,b1 in zip(ppd['a_state'].T[1],ppd['b_p'].T[1])]
ax[1,1].set_title('Presidential2016 vs Biden swing', fontsize=16)
ax[1,1].axhline(np.max(SW), color='r')
ax[1,1].axhline(np.min(SW), color='r')
ax[1,1].set_ylim(-20, 20)
plt.tight_layout()
plt.show()
with state_slopes_model:
state_slopes_trace = pm.sample(1000, target_accept = 0.99)
PostPred = pd.DataFrame(state_slopes_trace['theta'], columns=data.FIPS)
y = PostPred.median(0).values
y_l95 = np.percentile(PostPred,2.5,axis=0)
y_u95 = np.percentile(PostPred,97.5,axis=0)
PostPredDf = pd.DataFrame([data['FIPS'], y, y_l95, y_u95, SW, logCP, M, V, P]).T
PostPredDf.columns = ['FIPS', 'pred', 'y_195', 'y_u95', 'SW', 'logCP', 'M', 'V', 'P']
PostPredDf.to_csv('postPredState.csv')
medianCovidSlope = np.mean(state_slopes_trace['b_c'], axis = 0)
stateSlopeDf = pd.DataFrame.from_dict({'state':State, 'meanCovidSlope':medianCovidSlope})
stateSlopeDf
stateSlopeDf.to_csv('stateCovidSlope.csv')
az.plot_forest(state_slopes_trace, var_names= ['g', 'mu_c', 'mu_m', 'mu_v', 'mu_p'])
plt.axvline(0)
plt.savefig('figs/stateForestPlot.png')
plt.show()
az.plot_forest(state_slopes_trace, var_names= 'b_c')
plt.axvline(0)
plt.savefig('figs/stateForestPlotCovidIndividStates.png')
plt.show()
az.plot_trace(state_slopes_trace, var_names= ['g', 'mu_c', 'mu_m', 'mu_v', 'mu_p', 'sigma'])
plt.show()